import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveRegressor
from warnings import filterwarnings
filterwarnings("ignore")
# Load the exported Instagram post insights and preview the first five rows.
# NOTE(review): the recorded `df.columns` output lists the first column as
# 'ID ' (trailing space), and the `df.tail()` preview shows U+FFFD replacement
# characters inside Hashtags -- the CSV may need header clean-up and an
# explicit `encoding=`; confirm against the raw file.
df = pd.read_csv("My Instagram data.csv")
df.head()
| ID | Month | Year | category | post category | Impressions | From Home | From Hashtags | From Explore | From Other | Saves | Comments | Shares | Likes | Profile Visits | Follows | Hashtags | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Jan | 2021 | forts | photo | 5716 | 5050 | 108 | 72 | 423 | 144 | 1 | 250 | 688 | 21 | 4 | #gadkille2021_maharahtra #shivray_samrajy #mar... |
| 1 | 2 | Jan | 2021 | history | video | 5325 | 4916 | 20 | 50 | 323 | 53 | 9 | 47 | 786 | 19 | 0 | #gadkille2021_maharahtra #shivray_samrajy #mar... |
| 2 | 3 | Jan | 2021 | facts | reel | 7091 | 6060 | 50 | 40 | 939 | 73 | 2 | 98 | 898 | 15 | 0 | #gadkille2021_maharahtra #shivray_samrajy #mar... |
| 3 | 4 | Jan | 2021 | shivaji maharaj | photo | 4520 | 3491 | 698 | 194 | 85 | 25 | 1 | 93 | 575 | 5 | 2 | #gadkille2021_maharahtra #shivray_samrajy #mar... |
| 4 | 5 | Jan | 2021 | sambhaji maharaj | photo | 6257 | 3269 | 2706 | 350 | 196 | 22 | 1 | 353 | 327 | 23 | 2 | #dasra #vijayadashami #dashera #gadkille2021_m... |
# Preview the last five rows of the dataset.
df.tail()
| ID | Month | Year | category | post category | Impressions | From Home | From Hashtags | From Explore | From Other | Saves | Comments | Shares | Likes | Profile Visits | Follows | Hashtags | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 245 | 246 | Nov | 2022 | forts | video | 4681 | 890 | 2379 | 966 | 10 | 9 | 6 | 9 | 382 | 7 | 6 | #gadkille2021_maharahtra�#marathaempire�#shiva... |
| 246 | 247 | Dec | 2022 | history | reel | 19066 | 428 | 501 | 17291 | 457 | 166 | 8 | 279 | 1097 | 100 | 22 | #gadkille2021_maharahtra�#marathaempire�#shiva... |
| 247 | 248 | Dec | 2022 | sambhaji maharaj | photo | 3988 | 340 | 501 | 2919 | 3 | 7 | 5 | 0 | 301 | 4 | 4 | #gadkille2021_maharahtra�#marathaempire�#shiva... |
| 248 | 249 | Dec | 2022 | shivaji maharaj | photo | 6124 | 572 | 2398 | 2710 | 440 | 73 | 1 | 61 | 352 | 5 | 4 | #gadkille2021_maharahtra�#marathaempire�#shiva... |
| 249 | 250 | Dec | 2022 | shivaji maharaj | video | 15187 | 1881 | 1989 | 10504 | 811 | 85 | 5 | 586 | 738 | 11 | 8 | #gadkille2021_maharahtra�#marathaempire�#shiva... |
# (number of rows, number of columns) of the loaded frame.
df.shape
(250, 17)
# Column labels exactly as read from the CSV header.
df.columns
Index(['ID ', 'Month', 'Year', 'category', 'post category', 'Impressions',
'From Home', 'From Hashtags', 'From Explore', 'From Other', 'Saves',
'Comments', 'Shares', 'Likes', 'Profile Visits', 'Follows', 'Hashtags'],
dtype='object')
# Column-wise minimum; text columns (e.g. Month) are compared as strings,
# so their "minimum" is the alphabetically first value.
df.min()
ID 1 Month Apr Year 2021 category facts post category photo Impressions 1121 From Home 0 From Hashtags 0 From Explore 0 From Other 0 Saves 4 Comments 0 Shares 0 Likes 72 Profile Visits 3 Follows 0 Hashtags #dasra #vijayadashami #dashera #gadkille2021_m... dtype: object
# Print per-column dtype and non-null count, plus overall memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 250 entries, 0 to 249 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 ID 250 non-null int64 1 Month 250 non-null object 2 Year 250 non-null int64 3 category 250 non-null object 4 post category 250 non-null object 5 Impressions 250 non-null int64 6 From Home 250 non-null int64 7 From Hashtags 250 non-null int64 8 From Explore 250 non-null int64 9 From Other 250 non-null int64 10 Saves 250 non-null int64 11 Comments 250 non-null int64 12 Shares 250 non-null int64 13 Likes 250 non-null int64 14 Profile Visits 250 non-null int64 15 Follows 250 non-null int64 16 Hashtags 250 non-null object dtypes: int64(13), object(4) memory usage: 33.3+ KB
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
| ID | Year | Impressions | From Home | From Hashtags | From Explore | From Other | Saves | Comments | Shares | Likes | Profile Visits | Follows | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 250.000000 | 250.000000 | 2.500000e+02 | 250.000000 | 250.000000 | 2.500000e+02 | 250.000000 | 250.000000 | 250.0000 | 250.000000 | 250.000000 | 250.000000 | 250.000000 |
| mean | 125.500000 | 2021.556000 | 4.662407e+04 | 3462.752000 | 1927.016000 | 3.921094e+04 | 1281.916000 | 310.364000 | 8.3120 | 217.508000 | 1993.272000 | 156.984000 | 75.560000 |
| std | 72.312977 | 0.497851 | 2.658521e+05 | 2765.502797 | 1867.021699 | 2.630805e+05 | 5089.363687 | 1561.463269 | 21.0503 | 1190.780387 | 7717.160361 | 1025.325745 | 440.689682 |
| min | 1.000000 | 2021.000000 | 1.121000e+03 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 4.000000 | 0.0000 | 0.000000 | 72.000000 | 3.000000 | 0.000000 |
| 25% | 63.250000 | 2021.000000 | 4.542250e+03 | 2032.500000 | 569.250000 | 2.985000e+02 | 75.250000 | 52.000000 | 2.0000 | 7.000000 | 178.000000 | 16.000000 | 4.000000 |
| 50% | 125.500000 | 2022.000000 | 7.468000e+03 | 2749.000000 | 1352.000000 | 1.723500e+03 | 245.000000 | 95.500000 | 5.0000 | 26.500000 | 546.000000 | 29.500000 | 12.000000 |
| 75% | 187.750000 | 2022.000000 | 1.984100e+04 | 4187.750000 | 2778.000000 | 1.137450e+04 | 736.500000 | 198.500000 | 8.0000 | 135.250000 | 1324.000000 | 69.750000 | 28.000000 |
| max | 250.000000 | 2022.000000 | 4.022772e+06 | 33595.000000 | 11817.000000 | 3.979419e+06 | 53442.000000 | 24225.000000 | 231.0000 | 18424.000000 | 112324.000000 | 15793.000000 | 6599.000000 |
# Count missing values in each column.
df.isnull().sum()
ID 0 Month 0 Year 0 category 0 post category 0 Impressions 0 From Home 0 From Hashtags 0 From Explore 0 From Other 0 Saves 0 Comments 0 Shares 0 Likes 0 Profile Visits 0 Follows 0 Hashtags 0 dtype: int64
# Distinct values that appear in the "category" column.
df["category"].unique()
array(['forts', 'history', 'facts', 'shivaji maharaj', 'sambhaji maharaj'],
dtype=object)
# Draw one histogram per numeric column on a grid of subplots.
df.hist()
array([[<AxesSubplot:title={'center':'ID '}>,
<AxesSubplot:title={'center':'Year'}>,
<AxesSubplot:title={'center':'Impressions'}>,
<AxesSubplot:title={'center':'From Home'}>],
[<AxesSubplot:title={'center':'From Hashtags'}>,
<AxesSubplot:title={'center':'From Explore'}>,
<AxesSubplot:title={'center':'From Other'}>,
<AxesSubplot:title={'center':'Saves'}>],
[<AxesSubplot:title={'center':'Comments'}>,
<AxesSubplot:title={'center':'Shares'}>,
<AxesSubplot:title={'center':'Likes'}>,
<AxesSubplot:title={'center':'Profile Visits'}>],
[<AxesSubplot:title={'center':'Follows'}>, <AxesSubplot:>,
<AxesSubplot:>, <AxesSubplot:>]], dtype=object)
# Distribution of impressions for each traffic source, one figure per source.
#
# `sns.distplot` is deprecated in seaborn; `histplot(..., kde=True,
# stat="density")` is the documented replacement that draws the same
# density-scaled histogram with a KDE overlay. `cut=3` keeps the KDE curve
# extending past the data range the way `distplot` drew it.
# The theme is set once up front so every figure (including the first) is
# created with the darkgrid style applied.
sns.set(style="darkgrid")
for source in ("From Home", "From Hashtags", "From Explore"):
    plt.figure(figsize=(10, 8))
    plt.title(f"Distribution of Impressions {source}")
    sns.histplot(df[source], kde=True, stat="density", kde_kws={"cut": 3})
    plt.show()
# Total impressions contributed by each traffic source across all posts.
home, hashtags, explore, other = (
    df[column].sum()
    for column in ("From Home", "From Hashtags", "From Explore", "From Other")
)

# Donut chart of the four totals.
# NOTE(review): `df` is passed as the data frame even though `values` and
# `names` are standalone four-element lists; the sibling pie charts omit it.
labels = ['From Home', 'From Hashtags', 'From Explore', 'Other']
values = [home, hashtags, explore, other]
fig = px.pie(
    df,
    values=values,
    names=labels,
    title='Impressions on Instagram Posts From Various Sources',
    hole=0.5,
)
fig.show()
# Disabled sample data (never executed; enabling it would overwrite `df`):
# df = pd.DataFrame({
# "post category": ["forts", "history", "facts", "shivaji maharaj", "sambhaji maharaj"],
# "impressions": [100, 200, 150, 300, 600]
# })

# Donut chart of impressions per content category. One (label, value) pair is
# passed per post, so category labels repeat.
# NOTE(review): presumably plotly sums the values of repeated labels into a
# single sector -- confirm against the plotly pie documentation.
labels = df["category"].to_list()
values = df["Impressions"].to_list()
fig = px.pie(
    names=labels,
    values=values,
    hole=0.5,
    title='Impressions on Instagram Posts From Post Categories',
)
fig.show()
# Donut chart of likes per content category (one label/value pair per post).
labels = df["category"].to_list()
Likes = df["Likes"].to_list()
fig = px.pie(
    names=labels,
    values=Likes,
    hole=0.5,
    title='Likes on Instagram Posts From Post Categories',
)
fig.show()
# Disabled sample data (never executed; enabling it would overwrite `df`):
# df = pd.DataFrame({
# "category": ["A", "B", "C", "D", "E"],
# "count": [100, 200, 150, 300, 250]
# })

# Impressions per content category.
fig = px.bar(df, x="category", y="Impressions",
             title="Relationship between Categories and impressions",
             color="category")
fig.show()

# Impressions per post format (the "post category" column), with bars
# coloured by content category. The title previously read "Categories and
# Counts", which did not match the plotted columns.
fig = px.bar(df, x="post category", y="Impressions",
             title="Relationship between Post Categories and Impressions",
             color="category")
fig.show()

# Likes per content category.
fig = px.bar(df, x="category", y="Likes",
             title="Relationship between Categories and likes",
             color="category")
fig.show()

# Follows gained per content category.
fig = px.bar(df, x="category", y="Follows",
             title="Relationship between Categories and Followers",
             color="category")
fig.show()
pip install --upgrade scipy
Requirement already satisfied: scipy in c:\anaconda\lib\site-packages (1.10.1) Requirement already satisfied: numpy<1.27.0,>=1.19.5 in c:\anaconda\lib\site-packages (from scipy) (1.23.5) Note: you may need to restart the kernel to use updated packages.
# Report which SciPy version this kernel has actually loaded.
import scipy
print(scipy.__version__)
1.10.1
# Import the subpackage explicitly so `scipy.signal.signaltools` is resolvable
# below. The private `_centered` helper is deliberately NOT imported from
# SciPy: it is re-implemented here and installed onto the module instead.
import scipy.signal


def _centered(arr, newsize):
    """Return the center ``newsize`` portion of ``arr``.

    Args:
        arr: Array to crop; only its ``shape`` and slicing are used.
        newsize: Target size, either a scalar or one entry per axis of
            ``arr``. No validation is performed on it.

    Returns:
        ``arr`` sliced along every axis from ``(shape - newsize) // 2`` for
        ``newsize`` elements. When the size difference is odd, the extra
        element is dropped from the end of the axis.
    """
    newsize = np.asarray(newsize)
    currsize = np.array(arr.shape)
    startind = (currsize - newsize) // 2
    endind = startind + newsize
    return arr[tuple(slice(lo, hi) for lo, hi in zip(startind, endind))]


# Compatibility shim: expose the helper under its old private location.
# NOTE(review): presumably needed by the library behind plotly's
# `trendline="ols"` used in the later scatter plots -- confirm, and drop this
# patch once that dependency no longer looks the name up.
scipy.signal.signaltools._centered = _centered
# Engagement metrics plotted against total impressions, each with an OLS
# trendline and marker size scaled by the metric itself.
# Each entry is (y-axis column, chart title).
impression_charts = [
    ("Likes", "Relationship Between Likes and Impressions"),
    ("Comments", "Relationship Between Comments and Total Impressions"),
    ("Shares", "Relationship Between Shares and Total Impressions"),
    ("Saves", "Relationship Between Post Saves and Total Impressions"),
]
for metric, chart_title in impression_charts:
    figure = px.scatter(
        data_frame=df,
        x="Impressions",
        y=metric,
        size=metric,
        trendline="ols",
        title=chart_title,
    )
    figure.show()
# Engagement metrics plotted against followers gained, each with an OLS
# trendline and marker size scaled by the metric itself.
# Each entry is (y-axis column, chart title). The Comments chart was
# previously titled "Post Likes" (copy-paste slip); its title now names the
# column that is actually plotted.
follow_charts = [
    ("Likes", "Relationship Between Post Likes and Followers Gained"),
    ("Shares", "Relationship Between Post Shares and Followers Gained"),
    ("Comments", "Relationship Between Post Comments and Followers Gained"),
]
for metric, chart_title in follow_charts:
    figure = px.scatter(
        data_frame=df,
        x="Follows",
        y=metric,
        size=metric,
        trendline="ols",
        title=chart_title,
    )
    figure.show()
# Pairwise correlation between the numeric columns, then each column's
# correlation with Impressions, strongest first.
# The frame also holds text columns (Month, category, post category,
# Hashtags); newer pandas versions raise on those instead of silently
# dropping them, so restrict to numeric columns explicitly.
correlation = df.select_dtypes(include="number").corr()
print(correlation["Impressions"].sort_values(ascending=False))
Impressions 1.000000 From Explore 0.999263 Profile Visits 0.988828 Follows 0.987860 Saves 0.985767 Likes 0.979050 Shares 0.975922 Comments 0.712542 From Other 0.520403 ID 0.200225 Year 0.138793 From Hashtags 0.027207 From Home 0.013366 Name: Impressions, dtype: float64
So we can say that more saves and likes will help you get more reach on Instagram. A higher number of shares will also help you get more reach, whereas a low number of comments will not hurt your reach as much, since comments show the weakest correlation with impressions among these engagement metrics.
Also, more saves, likes, and shares will help you gain more followers on Instagram.
On Instagram, the conversion rate measures how many followers you gain relative to the number of profile visits generated by a post.
The formula for the conversion rate is (Follows / Profile Visits) * 100.
# Conversion rate as defined above: total follows per 100 profile visits.
total_follows = df["Follows"].sum()
total_profile_visits = df["Profile Visits"].sum()
conversion_rate = total_follows / total_profile_visits * 100
print(conversion_rate)
48.13229373694134
# Same ratio against impressions: total follows per 100 impressions.
# Note this reuses (overwrites) the `conversion_rate` name.
total_follows = df["Follows"].sum()
total_impressions = df["Impressions"].sum()
conversion_rate = total_follows / total_impressions * 100
print(conversion_rate)
0.16206220683598807
# Same ratio against likes: total follows per 100 likes.
# Note this reuses (overwrites) the `conversion_rate` name.
total_follows = df["Follows"].sum()
total_likes = df["Likes"].sum()
conversion_rate = total_follows / total_likes * 100
print(conversion_rate)
3.790752090030864
# Same ratio against shares: total follows per 100 shares.
# Note this reuses (overwrites) the `conversion_rate` name.
total_follows = df["Follows"].sum()
total_shares = df["Shares"].sum()
conversion_rate = total_follows / total_shares * 100
print(conversion_rate)
34.73895213049635
# Followers gained against profile visits, with an OLS trendline and marker
# size scaled by the number of follows.
figure = px.scatter(
    df,
    x="Profile Visits",
    y="Follows",
    size="Follows",
    trendline="ols",
    title="Relationship Between Profile Visits and Followers Gained",
)
figure.show()
# Predict a post's impressions from its engagement metrics.
# Feature column order matters: prediction inputs must use the same order.
feature_columns = ['Likes', 'Saves', 'Comments', 'Shares',
                   'Profile Visits', 'Follows']
x = np.array(df[feature_columns])
y = np.array(df["Impressions"])

# Hold out 20% of the posts for evaluation; the split is seeded.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

# Seed the regressor as well: it shuffles the training data between passes,
# so without a fixed random_state the fitted weights -- and the score below --
# change from run to run even though the split is fixed.
model = PassiveAggressiveRegressor(random_state=42)
model.fit(xtrain, ytrain)

# R^2 of the predictions on the held-out posts.
model.score(xtest, ytest)
0.7463942010203388
# Engagement numbers for one hypothetical post, keyed by feature name.
# Insertion order matches the column order the model was trained on:
# Likes, Saves, Comments, Shares, Profile Visits, Follows.
sample_post = {
    "Likes": 282.0,
    "Saves": 233.0,
    "Comments": 4.0,
    "Shares": 9.0,
    "Profile Visits": 165.0,
    "Follows": 54.0,
}
# Single-row 2-D array, as expected by `predict`.
features = np.array([list(sample_post.values())])
model.predict(features)
array([11833.40089692])